import sys
import numpy as np
import pandas as pd
import scanpy as sc
import matplotlib.pyplot as plt
import seaborn as sns
sc.settings.set_figure_params(dpi=120)
sns.set_style("dark")
plt.rcParams["pdf.fonttype"] = 42
from sklearn.svm import SVR
def filter_cv_vs_mean(S: np.ndarray, N: int, svr_gamma: float=None, plot: bool=True, min_expr_cells: int=2,
max_expr_avg: float=20, min_expr_avg: float=0) -> np.ndarray:
muS = S.mean(1)
detected_bool = ((S > 0).sum(1) > min_expr_cells) & (muS < max_expr_avg) & (muS > min_expr_avg)
Sf = S[detected_bool, :]
mu = Sf.mean(1)
sigma = Sf.std(1, ddof=1)
cv = sigma / mu
log_m = np.log2(mu)
log_cv = np.log2(cv)
if svr_gamma is None:
svr_gamma = 150. / len(mu)
svr = SVR(gamma=svr_gamma)
svr.fit(log_m[:, None], log_cv)
fitted_fun = svr.predict
ff = fitted_fun(log_m[:, None])
score = log_cv - ff
xnew = np.linspace(np.min(log_m), np.max(log_m))
ynew = svr.predict(xnew[:, None])
nth_score = np.sort(score)[::-1][N]
if plot:
plt.scatter(log_m[score > nth_score], log_cv[score > nth_score], s=3, alpha=0.4, c="tab:red")
plt.scatter(log_m[score <= nth_score], log_cv[score <= nth_score], s=3, alpha=0.4, c="tab:blue")
mu_linspace = np.linspace(np.min(log_m), np.max(log_m))
plt.plot(mu_linspace, fitted_fun(mu_linspace[:, None]), c="k")
plt.xlabel("log2 mean S")
plt.ylabel("log2 CV S")
cv_mean_score = np.zeros(detected_bool.shape)
cv_mean_score[~detected_bool] = np.min(score) - 1e-16
cv_mean_score[detected_bool] = score
cv_mean_selected = cv_mean_score >= nth_score
return cv_mean_selected
adata1 = sc.read_10x_mtx("HS980_D60_Rep2_Unsorted/", cache=True)
adata2 = sc.read_10x_mtx("HS980_D60_NCAM1-High-sorted/", cache=True)
adata3 = sc.read_10x_mtx("HS980_D60_CD140b-High-sorted/", cache=True)
adata = adata1.concatenate([adata2, adata3])
adata.obs["batch"].value_counts()
adata.obs['n_counts'] = adata.X.sum(axis=1)
n, bins, *x = plt.hist(adata.obs['n_counts'], bins=100)
plt.xlabel("Number of UMIs")
plt.ylabel("Number of cells")
plt.axvline(4000, c="r")
plt.axvline(25000, c="r")
plt.show()
sc.pp.filter_cells(adata, min_counts=4000)
sc.pp.filter_cells(adata, max_counts=25000)
adata
adata.obs["batch"].value_counts()
sc.pl.highest_expr_genes(adata, n_top=20)
expressed_genes = np.sum(adata.X > 0, 1)
adata.obs['n_genes'] = expressed_genes
len(expressed_genes)
n, bins, *x = plt.hist(expressed_genes, bins=100)
plt.axvline(1000, c="r")
plt.axvline(6000, c="r")
plt.xlabel("Number of Genes")
plt.ylabel("Number of Cells")
plt.show()
adata = adata[adata.obs['n_genes'] > 1000, :].copy()
adata = adata[adata.obs['n_genes'] < 6000, :].copy()
mito_genes = adata.var_names.str.startswith('MT-')
adata.obs['percent_mito'] = np.sum(
adata[:, mito_genes].X, axis=1) / np.sum(adata.X, axis=1)
adata.obs['n_counts'] = adata.X.sum(axis=1)
adata
sc.pl.violin(adata, ['n_genes', 'n_counts', 'percent_mito'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(adata, x='n_counts', y='percent_mito')
adata = adata[adata.obs['percent_mito'] < 0.20].copy()
adata.obs["batch"].value_counts()
adata_raw = adata.copy()
adata
sc.pp.filter_genes(adata, min_cells=20)
sc.pp.normalize_total(adata)
adata.raw = adata
S_genes_hum = ["MCM5", "PCNA", "TYMS", "FEN1", "MCM2", "MCM4", "RRM1", "UNG", "GINS2",
"MCM6", "CDCA7", "DTL", "PRIM1", "UHRF1", "CENPU", "HELLS", "RFC2",
"RPA2", "NASP", "RAD51AP1", "GMNN", "WDR76", "SLBP", "CCNE2", "UBR7",
"POLD3", "MSH2", "ATAD2", "RAD51", "RRM2", "CDC45", "CDC6", "EXO1", "TIPIN",
"DSCC1", "BLM", "CASP8AP2", "USP1", "CLSPN", "POLA1", "CHAF1B", "BRIP1", "E2F8"]
G2M_genes_hum = ["HMGB2", "CDK1", "NUSAP1", "UBE2C", "BIRC5", "TPX2", "TOP2A", "NDC80",
"CKS2", "NUF2", "CKS1B", "MKI67", "TMPO", "CENPF", "TACC3", "PIMREG",
"SMC4", "CCNB2", "CKAP2L", "CKAP2", "AURKB", "BUB1", "KIF11", "ANP32E",
"TUBB4B", "GTSE1", "KIF20B", "HJURP", "CDCA3", "JPT1", "CDC20", "TTK",
"CDC25C", "KIF2C", "RANGAP1", "NCAPD2", "DLGAP5", "CDCA2", "CDCA8", "ECT2",
"KIF23", "HMMR", "AURKA", "PSRC1", "ANLN", "LBR", "CKAP5", "CENPE",
"CTCF", "NEK2", "G2E3", "GAS2L3", "CBX5", "CENPA"]
sc.tl.score_genes_cell_cycle(adata, s_genes=S_genes_hum, g2m_genes=G2M_genes_hum)
cv_vs_mean_keep = filter_cv_vs_mean(adata.X.toarray().T, N=2000, max_expr_avg=50)
sc.pp.log1p(adata)
adata = adata[:, cv_vs_mean_keep].copy()
sc.pp.regress_out(adata, ['S_score'])
sc.pp.regress_out(adata, ['G2M_score'])
sc.pp.scale(adata, max_value=10)
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca(adata, color=['BEST1', 'MITF', 'PDGFRB', 'NCAM1', 'batch'])
sc.pl.pca_variance_ratio(adata, log=True)
sc.pp.neighbors(adata, n_neighbors=30, n_pcs=15)
sc.tl.umap(adata, alpha=0.3, min_dist=0.5, random_state=0)
sc.tl.louvain(adata, resolution=1)
sc.pl.umap(adata, use_raw=True, color=['louvain', "phase", 'batch'], ncols=3)
em = adata.obsm["X_umap"]
plt.scatter(em[:, 0], em[:, 1], s=5)
plt.axvline(2)
plt.axhline(-2.7)
nl = []
for i, x in zip(adata.obs["louvain"], em[:, 0]):
if x<2:
nl.append("9")
else:
nl.append(i)
adata.obs["louvain"] = nl
sc.pl.umap(adata, color='louvain')
nl = []
for i, x in zip(adata.obs["louvain"], em[:, 0]):
if x>10 and i=="5":
nl.append("10")
else:
nl.append(i)
adata.obs["louvain"] = nl
sc.pl.umap(adata, color='louvain')
nl = []
for i, y in zip(adata.obs["louvain"], em[:, 1]):
if y<-2.7 and i=="6":
nl.append("11")
elif y<2.5 and i=="1":
nl.append("7")
else:
nl.append(i)
adata.obs["louvain"] = nl
sc.pl.umap(adata, color='louvain')
adata_raw.obsm["X_umap"] = adata.obsm["X_umap"]
adata_raw.obs["louvain"] = adata.obs["louvain"]
adata_raw_norm = adata_raw.copy()
sc.pp.normalize_total(adata_raw_norm)
sc.pp.log1p(adata_raw_norm)
batch2sample = {"0":"Unsorted", "1":"NCAM1-High", "2":"CD140b-High"}
adata_raw_norm.obs['sample'] = [batch2sample[i] for i in adata_raw_norm.obs['batch']]
adata_raw.obs['sample'] = [batch2sample[i] for i in adata_raw.obs['batch']]
adata.obs['sample'] = [batch2sample[i] for i in adata.obs['batch']]
sc.pl.umap(adata_raw_norm, use_raw=False, color=["NCAM1", "PDGFRB", "sample"], ncols=4)
sc.pl.umap(adata_raw_norm, use_raw=False, color=["MITF", "TYRP1", "SERPINF1", "DCT", "SFRP5",
"RLBP1", "BEST1", "RPE65", "TTR", "sample"], ncols=5)
sc.pl.umap(adata_raw_norm, use_raw=False, color=["RAX", "VSX2", "CRABP1", "SIX3", "SOX2", "PAX6",
"sample"], ncols=5)
louvain2name = {'0': "Late RPE",
'1': "Late RPE",
'10': "Late RPE",
'11': "Neural Retina Progenitors",
'2': "Mid RPE",
'3': "Late RPE", '4': "Late RPE", '5': "Mid RPE", '6': "Retinal Progenitors",
'7': "Early RPE",
'8': "EMT RPE", '9': "Muscle"}
adata_raw_norm.obs["cell_type"] = [louvain2name[i] for i in adata_raw_norm.obs["louvain"]]
adata_raw.obs["cell_type"] = [louvain2name[i] for i in adata_raw.obs["louvain"]]
sc.pl.umap(adata_raw_norm, use_raw=False, color=["cell_type", "sample"], wspace=0.6, ncols=4)
em = adata_raw_norm.obsm["X_umap"]
n2c = {"Unsorted":"grey", "NCAM1-High":"#ff40ff", "CD140b-High":"#0432ff"}
plt.scatter(em[:, 0], em[:, 1], c=[n2c[i] for i in adata_raw_norm.obs["sample"]], s=5, alpha=0.5, rasterized=True)
plt.axis("off")
n2n = {'EMT RPE':'#1f77b4',
'Early RPE': '#ff7f0e',
'Late RPE': '#279e68',
'Mid RPE': '#d62728',
'Muscle': '#aa40fc',
'Neural Retina Progenitors': '#8c564b',
'Neuronal':'#e377c2',
'Retinal Progenitors':'#b5bd61'}
plt.scatter(em[:, 0], em[:, 1], c=[n2n[i] for i in adata_raw_norm.obs["cell_type"]],
s=5, alpha=0.5, rasterized=False)
plt.axis("off")
plt.show()
adata_raw_norm.obs["sample"].value_counts()
df = adata_raw_norm.obs
df.groupby(["cell_type", "sample"])["sample"].value_counts()/df.groupby(["sample"])["sample"].value_counts()*100
import matplotlib
x = [0]*7 + [1]*7 + [2]*7
y = [1, 2, 3, 4, 5, 6, 7] * 3
s = [2.531646, 14.285714, 13.562387, 15.099458, 49.909584, 0.994575, 3.616637]
s += [0.101317, 0, 0.202634, 27.456940, 69.604863, 0.405268, 2.228977]
s += [1.538462, 5.025641, 2.461538, 24.923077, 63.487179, 0.820513, 1.743590]
c = ['#8c564b', '#b5bd61', '#ff7f0e', '#d62728', '#279e68', '#aa40fc', '#1f77b4'] * 3
xticks = ["NCAM1-High", "CD140b-High", "Unsorted"]
yticks = ["NeuralRet", "RetProg", "EarlyRPE", "MidRPE", "LateRPE", "Muscle", "EMT-RPE"]
sns.set_style("white")
plt.scatter(x[::-1]+[3]*6, y+[1, 2, 3, 4, 5, 6], s=np.array(s[::-1]+[1, 10, 25, 50, 75, 100])*5, c=c+["black"]*6)
plt.xticks(range(0, 3), xticks, rotation=90)
plt.yticks(range(1, 8), yticks[::-1])
plt.gca().spines['top'].set_visible(False)
plt.gca().spines['right'].set_visible(False)
plt.gca().spines['bottom'].set_visible(False)
plt.gca().spines['left'].set_visible(False)
plt.xlim(-1, 5)
plt.show()
a_tmp = adata_raw_norm.copy()
a_tmp = a_tmp[a_tmp.obs["cell_type"].isin(["Late RPE"])]
a_tmp
from miscalg import enrichment
t2t = {"Unsorted":0, "NCAM1-High":1, "CD140b-High":2}
Xgen = a_tmp.X.toarray().T
enrichment_score = enrichment.enrichment_score(Xgen, np.array([t2t[i] for i in a_tmp.obs["sample"]]))
emarkers = enrichment.extract_enriched(enrichment_score,
np.array(a_tmp.var.index), n_enriched=10)
pd.DataFrame(emarkers).head(20)
markers = ["MITF", "TYRP1", "PMEL", "SERPINF1", "DCT", "ELN",
"RLBP1", "BEST1", "RPE65", "RGR", "TTR", "SFRP5"]
sc.pl.dotplot(a_tmp, markers, groupby='sample',
dendrogram=False)
n2i = {'EMT RPE': 2,
'Early RPE': 3,
'Late RPE': 5,
'Mid RPE': 4,
'Muscle': 6,
'Neural Retina Progenitors': 0,
'Retinal Progenitors': 1}
from miscalg import enrichment
Xgen = adata_raw.X.toarray().T
enrichment_score = enrichment.enrichment_score(Xgen, np.array([n2i[i] for i in adata_raw.obs["cell_type"]]))
emarkers = enrichment.extract_enriched(enrichment_score,
np.array(adata_raw.var.index), n_enriched=20)
pd.DataFrame(emarkers)
genes = ["STMN2", "NEUROD4", "NEUROD1", "RCVRN", "ONECUT1", "ATOH7",
"SFRP2", "HES5", "SOX2", "VSX2", "CRB1", "FEZF2",
"ACTA2", "ZIC1", "TAGLN", "LYPD1", "COL4A1", "PDGFRA",
"TMEFF2", "MITF", "DCT", "TYRP1", "PMEL", "TYR", "RLBP1", "BEST1", "RBP1",
"RPE65", "TTR", "RGR", "SLC6A13", "SFRP5",
"MYL1", "MYH3", "ACTA1", "TNNI2", "TNNC2"]
X = pd.DataFrame(adata_raw_norm[:, genes].X.toarray())
X.index = adata_raw_norm.obs["cell_type"]
X.columns = genes
X = X.groupby(X.index).mean()
X = X.loc[["Neural Retina Progenitors", "Retinal Progenitors", "EMT RPE",
"Early RPE", "Mid RPE", "Late RPE", "Muscle"]]
plt.figure(None, (12, 4))
sns.heatmap((X - X.min()) / (X.max() - X.min()), cmap='inferno')
plt.show()